林嶔 (Lin, Chin)
Lesson 7
– 過度擬合問題目前已經學習到的可行解法包含:小批量、資料擴增、正則化、Dropout
– 梯度消失問題目前已經學習到的可行解法包含:ReLU、Batch Normalization
– 權重初始化問題我們尚未面對,目前能想到的僅透過強大的優化器(SGD、Adam)來避免陷入局部極值。
– 這個現象告訴我們,我們如果有辦法提供給較淺的層一些梯度,那之後當每一層都有足夠大的梯度時整個網路就會開始往好的方向走了。
– Google研究院在當年度訓練出了一個22層深的網路GoogLeNet(又稱Inception Net),是當時比賽中最深的一個網路,而他也順利的以93.3%的準確度於當年奪冠,隨後他們於2015年所發表的研究:Going Deeper with Convolutions提到了他們如何實現這個網路的細節想法。
– 讓我們把這個思路實現在前面幾節課沒辦法處理的IRIS資料集深層網路的優化。
# Custom mxnet training loop with optional per-layer gradient tracking.
#
# Args:
#   Iterator:    custom data iterator exposing reset()/iter.next()/value().
#   ctx:         mxnet device context (default: CPU).
#   save.grad:   if TRUE, record the mean absolute gradient of every hidden
#                '*_weight' parameter per epoch and plot them on a log scale
#                after training (used to visualise vanishing gradients).
#   loss_symbol: symbol whose first output is the training loss (MakeLoss).
#   pred_symbol: symbol used to extract the prediction model after training.
#   Optimizer:   optimizer object created by mx.opt.create().
#   num_round:   number of training epochs.
#
# Returns: an mxnet feed-forward model built from pred_symbol, sharing the
#          trained parameters of the loss executor.
my.model.FeedForward.create <- function (Iterator, ctx = mx.cpu(), save.grad = FALSE,
                                         loss_symbol, pred_symbol,
                                         Optimizer, num_round = 30) {
  require(abind)
  # Log the first 5 epochs plus ~30 evenly spaced epochs over the whole run.
  out_round <- unique(c(1:5, round(quantile(1:num_round, 1:30/30))))
  #0. Probe one batch to discover the input shapes and the batch size
  Iterator$reset()
  Iterator$iter.next()
  my_values <- Iterator$value()
  input_shape <- lapply(my_values, dim)
  batch_size <- tail(input_shape[[1]], 1)
  #1. Build an executor to train the model
  exec_list <- list(symbol = loss_symbol, ctx = ctx, grad.req = "write")
  exec_list <- append(exec_list, input_shape)
  my_executor <- do.call(mx.simple.bind, exec_list)
  #2. Set the initial parameters (deliberately small uniform init so that
  #   vanishing gradients in deep nets are easy to demonstrate)
  mx.set.seed(0)
  new_arg <- mxnet:::mx.model.init.params(symbol = loss_symbol,
                                          input.shape = input_shape,
                                          output.shape = NULL,
                                          initializer = mxnet:::mx.init.uniform(0.01),
                                          ctx = ctx)
  mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
  mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)
  #3. Define the updater
  my_updater <- mx.opt.get.updater(optimizer = Optimizer, weights = my_executor$ref.arg.arrays)
  #4. Forward/Backward training loop
  message('Start training:')
  set.seed(0)
  if (save.grad) {epoch_grad <- NULL}
  for (i in 1:num_round) {
    Iterator$reset()
    batch_loss <- list()
    if (save.grad) {batch_grad <- list()}
    batch_seq <- 0
    t0 <- Sys.time()
    while (Iterator$iter.next()) {
      my_values <- Iterator$value()
      mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
      mx.exec.forward(my_executor, is.train = TRUE)
      mx.exec.backward(my_executor)
      update_args <- my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
      mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
      batch_loss[[length(batch_loss) + 1]] <- as.array(my_executor$ref.outputs[[1]])
      if (save.grad) {
        # Mean absolute gradient of each weight; output-head parameters
        # (names containing 'out') are excluded from the plot.
        grad_list <- sapply(my_executor$ref.grad.arrays, function (x) {if (!is.null(x)) {mean(abs(as.array(x)))}})
        grad_list <- unlist(grad_list[grepl('weight', names(grad_list), fixed = TRUE) & !grepl('out', names(grad_list), fixed = TRUE)])
        batch_grad[[length(batch_grad) + 1]] <- grad_list
      }
      batch_seq <- batch_seq + 1
    }
    if (i %in% out_round) {
      # BUG FIX: formatC()'s third positional argument is 'width', not
      # 'digits' -- name it explicitly so the loss prints with 4 decimals
      # and the speed with 2 (the original printed 6 decimals by default).
      message(paste0("epoch = ", i,
                     ": loss = ", formatC(mean(unlist(batch_loss)), format = "f", digits = 4),
                     " (Speed: ", formatC(batch_seq * batch_size/as.numeric(Sys.time() - t0, units = 'secs'), format = "f", digits = 2), " sample/secs)"))
    }
    if (save.grad) {epoch_grad <- rbind(epoch_grad, apply(abind(batch_grad, along = 2), 1, mean))}
  }
  # Plot per-layer gradient magnitudes; curves are jittered vertically so
  # overlapping layers remain distinguishable on the log-scale axis.
  if (save.grad) {
    epoch_grad[epoch_grad < 1e-8] <- 1e-8  # floor values for the log scale
    COL <- rainbow(ncol(epoch_grad))
    random_pos <- 2^runif(ncol(epoch_grad), -0.5, 0.5)
    plot(epoch_grad[,1] * random_pos[1], type = 'l', col = COL[1],
         xlab = 'epoch', ylab = 'mean of abs(grad)', log = 'y',
         ylim = range(epoch_grad))
    for (i in 2:ncol(epoch_grad)) {lines(1:nrow(epoch_grad), epoch_grad[,i] * random_pos[i], col = COL[i])}
    legend('topright', paste0('layer', 1:ncol(epoch_grad), '_weight'), col = COL, lwd = 1)
  }
  #5. Extract the prediction model from the trained executor
  my_model <- mxnet:::mx.model.extract.model(symbol = pred_symbol,
                                             train.execs = list(my_executor))
  return(my_model)
}
# IRIS as arrays: X.array holds 4 features x 150 samples, Y.array the
# one-hot species labels as 3 x 150 (columns are samples).
data(iris)
X.array <- t(as.matrix(iris[, -5]))
dim(X.array) <- c(4, 150)
Y.array <- t(model.matrix(~ -1 + iris[, 5]))
dim(Y.array) <- c(3, 150)
# Reproducible 100/50 train/test split over the sample columns.
set.seed(0)
TRAIN.seq <- sample(150, 100)
TRAIN.X.array <- X.array[, TRAIN.seq]
TRAIN.Y.array <- Y.array[, TRAIN.seq]
TEST.X.array <- X.array[, -TRAIN.seq]
TEST.Y.array <- Y.array[, -TRAIN.seq]
library(mxnet)
# Iterator
# Factory for a minimal hand-rolled mini-batch iterator over the global
# TRAIN.X.array / TRAIN.Y.array; the batch counter lives in the closure.
my_iterator_core <- function(batch_size) {
  current_batch <- 0
  n_batch <- ncol(TRAIN.Y.array)/batch_size
  reset <- function() {current_batch <<- 0}
  iter.next <- function() {
    current_batch <<- current_batch + 1
    current_batch <= n_batch
  }
  value <- function() {
    idx <- (current_batch - 1) * batch_size + seq_len(batch_size)
    # Pad a short final batch with randomly re-sampled training columns.
    overflow <- idx > ncol(TRAIN.Y.array)
    idx[overflow] <- sample(1:ncol(TRAIN.Y.array), sum(overflow))
    data <- mx.nd.array(array(TRAIN.X.array[, idx], dim = c(nrow(TRAIN.X.array), batch_size)))
    label <- mx.nd.array(array(TRAIN.Y.array[, idx], dim = c(nrow(TRAIN.Y.array), batch_size)))
    list(data = data, label = label)
  }
  list(reset = reset, iter.next = iter.next, value = value,
       batch_size = batch_size, batch = current_batch)
}
# Wrap the closure-based iterator in a reference class so it exposes the
# interface mxnet's training code expects (reset / iter.next / value),
# inheriting from the mxnet iterator base class Rcpp_MXArrayDataIter.
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "batch_size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, batch_size = 100){
# The 'iter' argument is ignored; the core iterator is built here.
.self$iter <- my_iterator_core(batch_size = batch_size)
.self
},
value = function(){
# Current batch as list(data = , label = ) of mx.nd.array objects.
.self$iter$value()
},
iter.next = function(){
# Advance one batch; returns FALSE once the epoch is exhausted.
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
# Mini-batches of 20 samples: 100 training samples -> 5 batches per epoch.
my_iter = my_iterator_func(iter = NULL, batch_size = 20)
# Optimizer: plain SGD with momentum.
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 0.05,
                              momentum = 0.9, wd = 0)
# Model architecture: 3 hidden ReLU layers of 10 units plus a softmax head.
# With the tiny uniform(0.01) initialisation this depth is already hard for
# SGD to optimise, which is the point of the demonstration.
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 <- mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
relu2 <- mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
fc3 <- mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = 'fc3')
relu3 <- mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
fc4 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 3, name = 'fc4')
softmax_layer <- mx.symbol.softmax(data = fc4, axis = 1, name = 'softmax_layer')
# Multinomial log-loss; eps guards against log(0).
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
# Training (gradients recorded so the vanishing-gradient plot is produced).
model <- my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                     loss_symbol = m_logloss, pred_symbol = softmax_layer,
                                     Optimizer = my_optimizer, num_round = 1000)
# Model architecture with deep supervision: every hidden layer also feeds its
# own softmax head, and the loss mixes all three heads so that the shallow
# layers receive gradient directly (the GoogLeNet auxiliary-classifier idea).
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 <- mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc1_out <- mx.symbol.FullyConnected(data = relu1, num.hidden = 3, name = 'fc1_out')
softmax1 <- mx.symbol.softmax(data = fc1_out, axis = 1, name = 'softmax1')
fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
relu2 <- mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
fc2_out <- mx.symbol.FullyConnected(data = relu2, num.hidden = 3, name = 'fc2_out')
softmax2 <- mx.symbol.softmax(data = fc2_out, axis = 1, name = 'softmax2')
fc3 <- mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = 'fc3')
relu3 <- mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
fc3_out <- mx.symbol.FullyConnected(data = relu3, num.hidden = 3, name = 'fc3_out')
softmax3 <- mx.symbol.softmax(data = fc3_out, axis = 1, name = 'softmax3')
# Weighted mixture of the three heads; the deepest head dominates.
softmax_layer <- softmax1 * 0.01 + softmax2 * 0.01 + softmax3 * 0.98
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
# Training: the loss uses the mixture, prediction uses only the deepest head.
model <- my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                     loss_symbol = m_logloss, pred_symbol = softmax3,
                                     Optimizer = my_optimizer, num_round = 1000)
# Model architecture: the same deeply-supervised network, built with a loop.
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
softmax_list <- list()
softmax_layer <- 0
# Mixture weights per head; the deepest head dominates the loss.
weights <- c(0.01, 0.01, 0.98)
for (i in 1:3) {
  if (i == 1) {
    fc <- mx.symbol.FullyConnected(data = data, num.hidden = 10, name = paste0('fc', i))
  } else {
    fc <- mx.symbol.FullyConnected(data = relu, num.hidden = 10, name = paste0('fc', i))
  }
  relu <- mx.symbol.Activation(data = fc, act.type = 'relu', name = paste0('relu', i))
  fc_out <- mx.symbol.FullyConnected(data = relu, num.hidden = 3, name = paste0('fc', i, '_out'))
  softmax_list[[i]] <- mx.symbol.softmax(data = fc_out, axis = 1, name = paste0('softmax', i))
  softmax_layer <- softmax_layer + softmax_list[[i]] * weights[i]
}
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
# Training: prediction uses only the deepest (third) head.
model <- my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                     loss_symbol = m_logloss, pred_symbol = softmax_list[[3]],
                                     Optimizer = my_optimizer, num_round = 1000)
– 這個式子假定不存在bias項,這樣推導較為簡單。
\[ \begin{align} l_1 & = L(x,W^1) \\ h_1 & = ReLU(l_1) \\ out_1 & = L(h_1,W^{o_1}) \\ o_1 & = S(out_1) \\\\ l_2 & = L(h_1,W^2) \\ h_2 & = ReLU(l_2) \\ out_2 & = L(h_2,W^{o_2}) \\ o_2 & = S(out_2) \\\\ o & = \lambda o_1 + (1 - \lambda) o_2\\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]
\[ \begin{align} l_1 & = L(x,W^1) \\ h_1 & = ReLU(l_1) \\ l_2 & = L(h_1,W^2) \\ h_2 & = ReLU(l_2) \\ out_2 & = L(h_2,W^{o_2}) \\ o = o_2 & = S(out_2) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]
\[ \begin{align} grad.W^{o_2} & = \frac{\partial}{\partial W^{o_2}}loss = grad.out_2 \otimes \frac{\partial}{\partial W^{o_2}}out_2 = \frac{{1}}{n} \otimes (h_2)^T \bullet grad.out_2 \\ grad.W^2 & = \frac{\partial}{\partial W^2}loss = grad.l_2 \otimes \frac{\partial}{\partial W^{2}}l_2 = \frac{{1}}{n} \otimes (h_1)^T \bullet grad.l_2 \\ grad.W^1 & = \frac{\partial}{\partial W^1}loss = grad.l_1 \otimes \frac{\partial}{\partial W^{1}}l_1 = \frac{{1}}{n} \otimes (x)^T \bullet grad.l_1 \end{align} \]
\[ \begin{align} grad.W^{o_1} & = \frac{\partial}{\partial W^{o_1}}loss = grad.out_1 \otimes \frac{\partial}{\partial W^{o_1}}out_1 = \frac{{1}}{n} \otimes (h_1)^T \bullet grad.out_1 \\ grad.W^1 & = \frac{\partial}{\partial W^1}loss = grad.l_1 \otimes \frac{\partial}{\partial W^{1}}l_1 = \frac{{1}}{n} \otimes (x)^T \bullet grad.l_1 \end{align} \]
# Evaluate on the held-out IRIS samples.
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table <- table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
# Drop the parameters of the two auxiliary heads (fc1_out / fc2_out): they
# only exist to feed gradient during training; the deepest head predicts on
# its own, so removing them must not change the confusion table.
model$arg.params <- model$arg.params[!grepl('fc[1-2]_out', names(model$arg.params))]
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table <- table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##
## 1 2 3
## 1 18 0 0
## 2 0 13 0
## 3 0 2 17
會對預測造成一些干擾
各層之間的輸出在Loss function中的權重難以平衡
增加了運算資源的消耗
– 事實上一個更關鍵的突破在2015年的ILSVRC競賽出現,這個突破可以說是至今為止深度學習在理論上最重要的突破,獲勝團隊是由微軟亞洲研究院何愷明所領軍的團隊,他們發展出的ResNet將錯誤率降低至3.57%,大幅超越了2014年度的冠軍GoogLeNet的6.7%以及人類平均的5.0%。
– 更值得一提的是,在所有人都被梯度消失問題所困擾的時刻,何愷明的團隊在2015年的ILSVRC中所提出的ResNet是一個1000層的網路,同一個時間幾乎沒有團隊有能力訓練超過50層的神經網路。
– 想當然耳,這個爆炸級的研究:Deep Residual Learning for Image Recognition在2016年的CVPR上發表後,理所當然的獲得了該研討會的最佳會議論文獎:
– 讓我們用數學式稍微描述一下,假設我們有一個雙隱藏層的MLP,那預測式在加入他的概念後會變成什麼樣子:
\[ \begin{align} l_1 & = L(x,W^1) \\ h_1 & = ReLU(l_1) \\ r_1 & = h_1 + x \\\\ l_2 & = L(r_1,W^2) \\ h_2 & = ReLU(l_2) \\ r_2 & = h_2 + r_1 \\\\ l_3 & = L(r_2,W^3) \\ o & = S(l_3) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]
– 假使我們要改變維度時,那我們就必需放棄這一個連接手段。
\[ \begin{align} grad.o & = \frac{\partial}{\partial o}loss = \frac{o-y}{o(1-o)} \\ grad.l_3 & = \frac{\partial}{\partial l_3}loss = grad.o \otimes \frac{\partial}{\partial l_3}o= o-y \\ grad.W^3 & = \frac{\partial}{\partial W^3}loss = grad.l_3 \otimes \frac{\partial}{\partial W^3}l_3 = \frac{{1}}{n} \otimes (r_2)^T \bullet grad.l_3\\ grad.r_2 & = \frac{\partial}{\partial r_2}loss = grad.l_3 \otimes \frac{\partial}{\partial r_2}l_3 = grad.l_3 \bullet (W^3)^T \\\\ grad.h_2 & = \frac{\partial}{\partial h_2}loss = grad.r_2 \otimes \frac{\partial}{\partial h_2}r_2 = grad.r_2 \\ grad.l_2 & = \frac{\partial}{\partial l_2}loss = grad.h_2 \otimes \frac{\partial}{\partial l_2}h_2 = grad.h_2 \otimes \frac{\partial}{\partial l_2}ReLU(l_2) \\ grad.W^2 & = \frac{\partial}{\partial W^2}loss = grad.l_2 \otimes \frac{\partial}{\partial W^2}l_2 = \frac{{1}}{n} \otimes (r_1)^T \bullet grad.l_2\\ grad.r_1 & = \frac{\partial}{\partial r_1}loss = grad.l_2 \otimes \frac{\partial}{\partial r_1}l_2 + grad.r_2 \otimes \frac{\partial}{\partial r_1} r_2 \\ & = grad.l_2 \bullet (W^2)^T + grad.r_2 \\\\ grad.h_1 & = \frac{\partial}{\partial h_1}loss = grad.r_1 \otimes \frac{\partial}{\partial h_1}r_1 = grad.r_1 \\ grad.l_1 & = \frac{\partial}{\partial l_1}loss = grad.h_1 \otimes \frac{\partial}{\partial l_1}h_1 = grad.h_1 \otimes \frac{\partial}{\partial l_1}ReLU(l_1) \\ grad.W^1 & = \frac{\partial}{\partial W^1}loss = grad.l_1 \otimes \frac{\partial}{\partial W^1}l_1 = \frac{{1}}{n} \otimes (x)^T \bullet grad.l_1 \\ grad.x & = \frac{\partial}{\partial x}loss = grad.l_1 \otimes \frac{\partial}{\partial x}l_1 + grad.r_1 \otimes \frac{\partial}{\partial x} r_1 = grad.l_1 \bullet (W^1)^T + grad.r_1 \\ & = grad.l_1 \bullet (W^1)^T + grad.l_2 \bullet (W^2)^T + grad.r_2 \end{align} \]
– 因為每一層\(r\)的梯度都包含最頂層的值,所以梯度消失問題迎刃而解!這樣自然可以訓練一個1000層深的網路而不會發生梯度消失問題。
– 你可以稍微想一下,這樣一個1000層的網路似乎失去了生物學上的意義,那這樣的模型還會有預測效果嗎?
– 讓我們展開預測式來看看它到底是長什麼樣子:
\[ \begin{align} l_1 & = L(x,W^1) = xW^1\\ h_1 & = ReLU(l_1) \\ r_1 & = h_1 + x \\\\ l_2 & = L(r_1,W^2) = r_1W^2 = (h_1 + x)W^2 \\ h_2 & = ReLU(l_2) \\ r_2 & = h_2 + r_1 \\\\ l_3 & = L(r_2,W^3) = r_2W^3 = (h_2 + h_1 + x)W^3 \\ o & = S(l_3) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]
\[ \begin{align} l_3 & = (h_2 + h_1 + x)W^3 \\ & = (ReLU(l_2) + ReLU(l_1) + x)W^3 \\ & = (ReLU((h_1 + x)W^2) + ReLU(xW^1) + x)W^3 \\ & = (ReLU((ReLU(xW^1) + x)W^2) + ReLU(xW^1) + x)W^3 \end{align} \]
– 讓我們直接試試看之前用SGD絕對不可能優化成功的6層網路訓練:
# Optimizer
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
# Model architecture: a 6-hidden-layer MLP with identity shortcuts
# (r_k = h_k + r_{k-1}), i.e. residual learning. Every hidden layer has
# width 3 so the element-wise additions are shape-compatible.
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
# First block has no shortcut (its input has a different width).
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 3, name = 'fc1')
relu1 <- mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 3, name = 'fc2')
relu2 <- mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
plus2 <- mx.symbol.broadcast_plus(lhs = relu2, rhs = relu1, name = 'plus2')
fc3 <- mx.symbol.FullyConnected(data = plus2, num.hidden = 3, name = 'fc3')
relu3 <- mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
plus3 <- mx.symbol.broadcast_plus(lhs = relu3, rhs = plus2, name = 'plus3')
fc4 <- mx.symbol.FullyConnected(data = plus3, num.hidden = 3, name = 'fc4')
relu4 <- mx.symbol.Activation(data = fc4, act.type = 'relu', name = 'relu4')
plus4 <- mx.symbol.broadcast_plus(lhs = relu4, rhs = plus3, name = 'plus4')
fc5 <- mx.symbol.FullyConnected(data = plus4, num.hidden = 3, name = 'fc5')
relu5 <- mx.symbol.Activation(data = fc5, act.type = 'relu', name = 'relu5')
plus5 <- mx.symbol.broadcast_plus(lhs = relu5, rhs = plus4, name = 'plus5')
fc6 <- mx.symbol.FullyConnected(data = plus5, num.hidden = 3, name = 'fc6')
relu6 <- mx.symbol.Activation(data = fc6, act.type = 'relu', name = 'relu6')
plus6 <- mx.symbol.broadcast_plus(lhs = relu6, rhs = plus5, name = 'plus6')
fc7 <- mx.symbol.FullyConnected(data = plus6, num.hidden = 3, name = 'fc7')
softmax_layer <- mx.symbol.softmax(data = fc7, axis = 1, name = 'softmax_layer')
# Multinomial log-loss; eps guards against log(0).
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
# Training
model <- my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                     loss_symbol = m_logloss, pred_symbol = softmax_layer,
                                     Optimizer = my_optimizer, num_round = 100)
# Predicting
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table <- table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##
## 1 2 3
## 1 18 0 0
## 2 0 13 0
## 3 0 2 17
# Inspect the learned parameters: 7 weight/bias pairs (fc1 ... fc7); the
# shortcut additions themselves are parameter-free.
PARAMS = model$arg.params
ls(PARAMS)
## [1] "fc1_bias" "fc1_weight" "fc2_bias" "fc2_weight" "fc3_bias"
## [6] "fc3_weight" "fc4_bias" "fc4_weight" "fc5_bias" "fc5_weight"
## [11] "fc6_bias" "fc6_weight" "fc7_bias" "fc7_weight"
# Predict a single test sample; this is the reference output for the manual
# forward-pass reconstruction that follows.
Input = TEST.X.array[,1]
dim(Input) = c(4, 1)
preds = predict(model, Input, array.layout = "colmajor")
print(preds)
## [,1]
## [1,] 0.936310470
## [2,] 0.061589610
## [3,] 0.002099835
# Reproduce the residual network's forward pass by hand in base R and check
# it against predict(): ReLU is pmax(., 0) and each shortcut is a plain
# matrix addition of the previous running activation.
PARAMS <- model$arg.params
Input <- TEST.X.array[, 1]
dim(Input) <- c(4, 1)
fc1_out <- t(Input) %*% as.array(PARAMS$fc1_weight) + as.array(PARAMS$fc1_bias)
relu1_out <- pmax(fc1_out, 0)
fc2_out <- relu1_out %*% as.array(PARAMS$fc2_weight) + as.array(PARAMS$fc2_bias)
relu2_out <- pmax(fc2_out, 0)
plus2_out <- relu2_out + relu1_out
fc3_out <- plus2_out %*% as.array(PARAMS$fc3_weight) + as.array(PARAMS$fc3_bias)
relu3_out <- pmax(fc3_out, 0)
plus3_out <- relu3_out + plus2_out
fc4_out <- plus3_out %*% as.array(PARAMS$fc4_weight) + as.array(PARAMS$fc4_bias)
relu4_out <- pmax(fc4_out, 0)
plus4_out <- relu4_out + plus3_out
fc5_out <- plus4_out %*% as.array(PARAMS$fc5_weight) + as.array(PARAMS$fc5_bias)
relu5_out <- pmax(fc5_out, 0)
plus5_out <- relu5_out + plus4_out
fc6_out <- plus5_out %*% as.array(PARAMS$fc6_weight) + as.array(PARAMS$fc6_bias)
relu6_out <- pmax(fc6_out, 0)
plus6_out <- relu6_out + plus5_out
fc7_out <- plus6_out %*% as.array(PARAMS$fc7_weight) + as.array(PARAMS$fc7_bias)
# Softmax over the 3 logits; the columns should match mxnet's prediction.
Softmax_out <- exp(fc7_out)/sum(exp(fc7_out))
cbind(t(Softmax_out), preds)
## [,1] [,2]
## [1,] 0.936310549 0.936310470
## [2,] 0.061589616 0.061589610
## [3,] 0.002099835 0.002099835
# Optimizer
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
# Model architecture: a 20-hidden-layer residual MLP built with a loop; the
# running 'plus' symbol always carries the accumulated shortcut.
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 3, name = 'fc1')
relu1 <- mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 3, name = 'fc2')
relu2 <- mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
plus <- mx.symbol.broadcast_plus(lhs = relu2, rhs = relu1, name = 'plus2')
for (i in 3:20) {
  fc <- mx.symbol.FullyConnected(data = plus, num.hidden = 3, name = paste0('fc', i))
  relu <- mx.symbol.Activation(data = fc, act.type = 'relu', name = paste0('relu', i))
  plus <- mx.symbol.broadcast_plus(lhs = relu, rhs = plus, name = paste0('plus', i))
}
# Output layer (fc21) and softmax head.
fc_final <- mx.symbol.FullyConnected(data = plus, num.hidden = 3, name = paste0('fc', i + 1))
softmax_layer <- mx.symbol.softmax(data = fc_final, axis = 1, name = 'softmax_layer')
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
# Training
model <- my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                     loss_symbol = m_logloss, pred_symbol = softmax_layer,
                                     Optimizer = my_optimizer, num_round = 200)
# Predicting
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table <- table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##
## 1 2 3
## 1 18 0 0
## 2 0 14 0
## 3 0 1 17
– 100層甚至是1000層理論上都是可行的,但要注意目前你會面對的問題是「梯度爆炸」,所以你可能需要調整一下SGD的學習率,或是使用Adam!
– 而在卷積神經網路中,隨著卷積器的運算產生的特徵圖,通常會比原始的輸入圖來的小,那這該怎麼辦呢?
– 這邊要引入一個新的東西叫做Padding,他是將原始輸入的外圍補上0,從而實現卷積後與卷積前的輸出相等:
– 請在這裡下載MNIST的手寫數字資料,而在上週我們已經利用這個程式碼將其切割成兩份了:
# Read the raw MNIST csv (label in column 1, 784 pixel columns after it)
# and split it 60/40 into train/test files.
library(data.table)
DAT <- fread("data/MNIST.csv", data.table = FALSE)
DAT <- data.matrix(DAT)
# Split data reproducibly.
set.seed(0)
Train.sample <- sample(1:nrow(DAT), nrow(DAT) * 0.6, replace = FALSE)
Train.X <- DAT[Train.sample, -1]
Train.Y <- DAT[Train.sample, 1]
Test.X <- DAT[-Train.sample, -1]
Test.Y <- DAT[-Train.sample, 1]
# Write both splits back out, label first, no headers.
fwrite(x = data.table(cbind(Train.Y, Train.X)),
       file = 'data/train_data.csv',
       col.names = FALSE, row.names = FALSE)
fwrite(x = data.table(cbind(Test.Y, Test.X)),
       file = 'data/test_data.csv',
       col.names = FALSE, row.names = FALSE)
# CSV-backed iterator for MNIST: wraps mxnet's mx.io.CSVIter and reshapes
# each batch into 28x28x1 images plus a 10-class one-hot label matrix.
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "data.csv", "data.shape", "batch.size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, data.csv, data.shape, batch.size){
# The 'iter' argument is ignored; a CSV iterator is built from the path.
csv_iter <- mx.io.CSVIter(data.csv = data.csv, data.shape = data.shape, batch.size = batch.size)
.self$iter <- csv_iter
.self
},
value = function(){
# Row layout per sample: element 1 is the digit label, the remaining 784
# elements are pixel intensities.
val <- as.array(.self$iter$value()$data)
val.x <- val[-1,]
# One-hot encode labels over the fixed level set 0:9 so every batch has
# all 10 rows even if some digits are absent.
val.y <- t(model.matrix(~ -1 + factor(val[1,], levels = 0:9)))
val.y <- array(val.y, dim = c(10, ncol(val.x)))
# Reshape pixels to width x height x channel x batch as mxnet expects.
dim(val.x) <- c(28, 28, 1, ncol(val.x))
val.x <- mx.nd.array(val.x)
val.y <- mx.nd.array(val.y)
list(data=val.x, label=val.y)
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
# data.shape = 785 = 1 label column + 784 pixels; mini-batches of 20.
my_iter = my_iterator_func(iter = NULL, data.csv = 'data/train_data.csv', data.shape = 785, batch.size = 20)
# Adam optimizer for the convolutional network.
my_optimizer = mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999,
epsilon = 1e-08, wd = 0)
– 你可以不斷的使用函數「mx.symbol.infer.shape」來確認目前的維度,以確認Padding的效果:
# input
data <- mx.symbol.Variable('data')
# First conv block: 5x5 kernels without padding, then 2x2 max-pooling.
# No residual shortcut here because the spatial size changes.
conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 10, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu")
pool1 <- mx.symbol.Pooling(data = relu1, pool_type = "max", kernel = c(2, 2), stride = c(2, 2))
# Residual conv blocks: 3x3 kernels with pad = c(1,1) keep the feature-map
# size unchanged, so identity shortcuts can be added element-wise.
conv2 <- mx.symbol.Convolution(data = pool1, kernel = c(3, 3), pad = c(1, 1), num_filter = 10, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu")
plus2 <- mx.symbol.broadcast_plus(lhs = relu2, rhs = pool1)
conv3 <- mx.symbol.Convolution(data = plus2, kernel = c(3, 3), pad = c(1, 1), num_filter = 10, name = 'conv3')
relu3 <- mx.symbol.Activation(data = conv3, act_type = "relu")
plus3 <- mx.symbol.broadcast_plus(lhs = relu3, rhs = plus2)
conv4 <- mx.symbol.Convolution(data = plus3, kernel = c(3, 3), pad = c(1, 1), num_filter = 10, name = 'conv4')
relu4 <- mx.symbol.Activation(data = conv4, act_type = "relu")
plus4 <- mx.symbol.broadcast_plus(lhs = relu4, rhs = plus3)
# Pool, flatten, and classify.
pool2 <- mx.symbol.Pooling(data = plus4, pool_type = "max", kernel = c(3, 3), stride = c(3, 3))
flatten <- mx.symbol.Flatten(data = pool2)
fc1 <- mx.symbol.FullyConnected(data = flatten, num_hidden = 150, name = 'fc1')
# NOTE(review): 'relu3' is reused here, overwriting the conv3 activation
# symbol above; harmless because plus3 was already built, but confusing.
relu3 <- mx.symbol.Activation(data = fc1, act_type = "relu")
fc2 <- mx.symbol.FullyConnected(data = relu3, num_hidden = 10, name = 'fc2')
# Softmax over the 10 digit classes. The symbol name stays 'lenet' so that
# downstream parameter/output names are unchanged.
resnet <- mx.symbol.softmax(data = fc2, axis = 1, name = 'lenet')
# m-log loss with eps guarding log(0).
label <- mx.symbol.Variable(name = 'label')
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(resnet + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
– 這裡僅僅是個範例,請你自己創造更深的網路並增加優化的代數。
# Train the small residual CNN for 20 epochs with Adam.
resnet_model <- my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                            loss_symbol = m_logloss, pred_symbol = resnet,
                                            Optimizer = my_optimizer, num_round = 20)
# Evaluate the trained CNN on the held-out MNIST split.
library(data.table)
DAT <- fread("data/test_data.csv", data.table = FALSE)
DAT <- data.matrix(DAT)
Test.X <- t(DAT[, -1])
dim(Test.X) <- c(28, 28, 1, ncol(Test.X))
Test.Y <- DAT[, 1]
predict_Y <- predict(resnet_model, Test.X)
confusion_table <- table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.97875
print(confusion_table)
## Test.Y
## 0 1 2 3 4 5 6 7 8 9
## 1 1639 1 1 0 1 1 9 1 0 3
## 2 1 1831 4 1 0 0 6 0 2 1
## 3 0 3 1618 3 0 0 0 14 1 0
## 4 1 3 14 1728 0 41 1 6 5 11
## 5 0 6 1 0 1581 1 4 8 2 5
## 6 1 0 0 2 0 1470 10 1 4 2
## 7 0 0 1 0 2 1 1615 0 1 0
## 8 4 2 9 1 4 0 0 1697 0 3
## 9 8 3 7 4 1 12 15 6 1651 4
## 10 9 2 1 3 17 25 1 20 9 1613
每次維度做修正時將無法繼續使用
整個網路需要優化的參數相當的浪費
– 這個研究是由康乃爾大學的博士後研究員黄高、清華大學生劉壯、Facebook AI研究院的Laurens van der Maaten以及康乃爾大學的電腦科學教授 Kilian Q. Weinberger等人所發表,論文名稱為:Densely Connected Convolutional Networks
– 這個研究在2017年的CVPR上發表後(Residual Learning發表於2016年的CVPR),也成功獲得了該屆的最佳會議論文獎!
– 符號\(||\)代表矩陣的並聯:
\[ \begin{align} l_1 & = L(x,W^1) \\ h_1 & = ReLU(l_1) \\ r_1 & = h_1 || x \\\\ l_2 & = L(r_1,W^2) \\ h_2 & = ReLU(l_2) \\ r_2 & = h_2 || r_1 \\\\ l_3 & = L(r_2,W^3) \\ o & = S(l_3) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]
– 在這裡梯度的數學推導需要全部展開才能做(並且涉及很多你可能沒有學過的數學符號),我們就不做了,直接用MxNet幫我們實現。
# IRIS as arrays again (self-contained for this section): X.array is
# 4 features x 150 samples, Y.array the one-hot species labels 3 x 150.
data(iris)
X.array <- t(as.matrix(iris[, -5]))
dim(X.array) <- c(4, 150)
Y.array <- t(model.matrix(~ -1 + iris[, 5]))
dim(Y.array) <- c(3, 150)
# Reproducible 100/50 train/test split over the sample columns.
set.seed(0)
TRAIN.seq <- sample(150, 100)
TRAIN.X.array <- X.array[, TRAIN.seq]
TRAIN.Y.array <- Y.array[, TRAIN.seq]
TEST.X.array <- X.array[, -TRAIN.seq]
TEST.Y.array <- Y.array[, -TRAIN.seq]
library(mxnet)
# Iterator
# Mini-batch iterator factory over the global TRAIN.X.array/TRAIN.Y.array;
# identical in behavior to the earlier section, repeated to keep this
# section self-contained.
my_iterator_core <- function(batch_size) {
  current_batch <- 0
  n_batch <- ncol(TRAIN.Y.array)/batch_size
  reset <- function() {current_batch <<- 0}
  iter.next <- function() {
    current_batch <<- current_batch + 1
    current_batch <= n_batch
  }
  value <- function() {
    idx <- (current_batch - 1) * batch_size + seq_len(batch_size)
    # Pad a short final batch with randomly re-sampled training columns.
    overflow <- idx > ncol(TRAIN.Y.array)
    idx[overflow] <- sample(1:ncol(TRAIN.Y.array), sum(overflow))
    data <- mx.nd.array(array(TRAIN.X.array[, idx], dim = c(nrow(TRAIN.X.array), batch_size)))
    label <- mx.nd.array(array(TRAIN.Y.array[, idx], dim = c(nrow(TRAIN.Y.array), batch_size)))
    list(data = data, label = label)
  }
  list(reset = reset, iter.next = iter.next, value = value,
       batch_size = batch_size, batch = current_batch)
}
# Reference-class wrapper exposing the iterator interface mxnet expects
# (reset / iter.next / value), inheriting from Rcpp_MXArrayDataIter.
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "batch_size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, batch_size = 100){
# The 'iter' argument is ignored; the core iterator is built here.
.self$iter <- my_iterator_core(batch_size = batch_size)
.self
},
value = function(){
# Current batch as list(data = , label = ) of mx.nd.array objects.
.self$iter$value()
},
iter.next = function(){
# Advance one batch; returns FALSE once the epoch is exhausted.
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
# Mini-batches of 20 samples: 100 training samples -> 5 batches per epoch.
my_iter = my_iterator_func(iter = NULL, batch_size = 20)
# Optimizer
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
# Model architecture: dense connections -- each block's input is the
# concatenation of ALL previous activations (mx.symbol.concat on dim = 1).
# Hidden widths grow 3,4,5,6,7,8, so the concatenated widths grow
# 3, 7, 12, 18, 25, 33 along the way.
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 3, name = 'fc1')
relu1 <- mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 4, name = 'fc2')
relu2 <- mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
concat2 <- mx.symbol.concat(data = list(relu1, relu2), num.args = 2, dim = 1, name = 'concat2')
fc3 <- mx.symbol.FullyConnected(data = concat2, num.hidden = 5, name = 'fc3')
relu3 <- mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
concat3 <- mx.symbol.concat(data = list(concat2, relu3), num.args = 2, dim = 1, name = 'concat3')
fc4 <- mx.symbol.FullyConnected(data = concat3, num.hidden = 6, name = 'fc4')
relu4 <- mx.symbol.Activation(data = fc4, act.type = 'relu', name = 'relu4')
concat4 <- mx.symbol.concat(data = list(concat3, relu4), num.args = 2, dim = 1, name = 'concat4')
fc5 <- mx.symbol.FullyConnected(data = concat4, num.hidden = 7, name = 'fc5')
relu5 <- mx.symbol.Activation(data = fc5, act.type = 'relu', name = 'relu5')
concat5 <- mx.symbol.concat(data = list(concat4, relu5), num.args = 2, dim = 1, name = 'concat5')
fc6 <- mx.symbol.FullyConnected(data = concat5, num.hidden = 8, name = 'fc6')
relu6 <- mx.symbol.Activation(data = fc6, act.type = 'relu', name = 'relu6')
concat6 <- mx.symbol.concat(data = list(concat5, relu6), num.args = 2, dim = 1, name = 'concat6')
fc7 <- mx.symbol.FullyConnected(data = concat6, num.hidden = 3, name = 'fc7')
softmax_layer <- mx.symbol.softmax(data = fc7, axis = 1, name = 'softmax_layer')
# Multinomial log-loss; eps guards against log(0).
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
# Training
model <- my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                     loss_symbol = m_logloss, pred_symbol = softmax_layer,
                                     Optimizer = my_optimizer, num_round = 100)
# Predicting
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table <- table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##
## 1 2 3
## 1 18 0 0
## 2 0 13 0
## 3 0 2 17
# Inspect the dense network's parameters: 7 weight/bias pairs (fc1 ... fc7);
# concatenations themselves are parameter-free.
PARAMS = model$arg.params
ls(PARAMS)
## [1] "fc1_bias" "fc1_weight" "fc2_bias" "fc2_weight" "fc3_bias"
## [6] "fc3_weight" "fc4_bias" "fc4_weight" "fc5_bias" "fc5_weight"
## [11] "fc6_bias" "fc6_weight" "fc7_bias" "fc7_weight"
# Predict a single test sample; reference output for the manual forward
# reconstruction below.
Input = TEST.X.array[,1]
dim(Input) = c(4, 1)
preds = predict(model, Input, array.layout = "colmajor")
print(preds)
## [,1]
## [1,] 9.990797e-01
## [2,] 9.203216e-04
## [3,] 4.921823e-17
# Reproduce the dense network's forward pass by hand in base R: ReLU is
# pmax(., 0) and each dense connection is a cbind() of all previous
# activations, mirroring mx.symbol.concat on dim = 1.
PARAMS <- model$arg.params
Input <- TEST.X.array[, 1]
dim(Input) <- c(4, 1)
fc1_out <- t(Input) %*% as.array(PARAMS$fc1_weight) + as.array(PARAMS$fc1_bias)
relu1_out <- pmax(fc1_out, 0)
fc2_out <- relu1_out %*% as.array(PARAMS$fc2_weight) + as.array(PARAMS$fc2_bias)
relu2_out <- pmax(fc2_out, 0)
plus2_out <- cbind(relu1_out, relu2_out)
fc3_out <- plus2_out %*% as.array(PARAMS$fc3_weight) + as.array(PARAMS$fc3_bias)
relu3_out <- pmax(fc3_out, 0)
plus3_out <- cbind(plus2_out, relu3_out)
fc4_out <- plus3_out %*% as.array(PARAMS$fc4_weight) + as.array(PARAMS$fc4_bias)
relu4_out <- pmax(fc4_out, 0)
plus4_out <- cbind(plus3_out, relu4_out)
fc5_out <- plus4_out %*% as.array(PARAMS$fc5_weight) + as.array(PARAMS$fc5_bias)
relu5_out <- pmax(fc5_out, 0)
plus5_out <- cbind(plus4_out, relu5_out)
fc6_out <- plus5_out %*% as.array(PARAMS$fc6_weight) + as.array(PARAMS$fc6_bias)
relu6_out <- pmax(fc6_out, 0)
plus6_out <- cbind(plus5_out, relu6_out)
fc7_out <- plus6_out %*% as.array(PARAMS$fc7_weight) + as.array(PARAMS$fc7_bias)
# Softmax over the 3 logits; the columns should match mxnet's prediction.
Softmax_out <- exp(fc7_out)/sum(exp(fc7_out))
cbind(t(Softmax_out), preds)
## [,1] [,2]
## [1,] 9.990797e-01 9.990797e-01
## [2,] 9.203218e-04 9.203216e-04
## [3,] 4.921838e-17 4.921823e-17
– 一個比較常用的非線性轉換函數叫做LeakyReLU,而他的數學式長成這樣:
\[ LeakyReLU(x, \alpha) = \left\{ \begin{array}{ll} x & \mbox{ if } x > 0 \\ \alpha x & \mbox{ otherwise} \end{array} \right. \]
\[ \frac{\partial}{\partial x}LeakyReLU(x, \alpha) = \left\{ \begin{array}{ll} 1 & \mbox{ if } x > 0 \\ \alpha & \mbox{ otherwise} \end{array} \right. \]
# Optimizer
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
# Model architecture: the 2-hidden-layer MLP, but with LeakyReLU activations
# (slope 0.25 for negative inputs) instead of plain ReLU.
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 <- mx.symbol.LeakyReLU(data = fc1, act.type = 'leaky', slope = 0.25, name = 'relu1')
fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
relu2 <- mx.symbol.LeakyReLU(data = fc2, act.type = 'leaky', slope = 0.25, name = 'relu2')
fc3 <- mx.symbol.FullyConnected(data = relu2, num.hidden = 3, name = 'fc3')
softmax_layer <- mx.symbol.softmax(data = fc3, axis = 1, name = 'softmax_layer')
# Multinomial log-loss; eps guards against log(0).
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
# Training
model <- my.model.FeedForward.create(Iterator = my_iter, ctx = mx.cpu(), save.grad = TRUE,
                                     loss_symbol = m_logloss, pred_symbol = softmax_layer,
                                     Optimizer = my_optimizer, num_round = 300)
– 但這可以作為一種輔助手段以協助傳遞梯度。
# Inspect the LeakyReLU network's parameters: 3 weight/bias pairs.
PARAMS = model$arg.params
ls(PARAMS)
## [1] "fc1_bias" "fc1_weight" "fc2_bias" "fc2_weight" "fc3_bias"
## [6] "fc3_weight"
# Predict a single test sample; reference output for the manual forward
# reconstruction below.
Input = TEST.X.array[,1]
dim(Input) = c(4, 1)
preds = predict(model, Input, array.layout = "colmajor")
print(preds)
## [,1]
## [1,] 9.996370e-01
## [2,] 3.629570e-04
## [3,] 1.929295e-13
# Reproduce the LeakyReLU network's forward pass by hand:
# LeakyReLU(x) = x for x > 0 and 0.25 * x otherwise, written here as
# pmax(x, 0) + 0.25 * pmin(x, 0) (numerically identical to scaling the
# negative entries in place).
PARAMS <- model$arg.params
Input <- TEST.X.array[, 1]
dim(Input) <- c(4, 1)
fc1_out <- t(Input) %*% as.array(PARAMS$fc1_weight) + as.array(PARAMS$fc1_bias)
relu1_out <- pmax(fc1_out, 0) + 0.25 * pmin(fc1_out, 0)
fc2_out <- relu1_out %*% as.array(PARAMS$fc2_weight) + as.array(PARAMS$fc2_bias)
relu2_out <- pmax(fc2_out, 0) + 0.25 * pmin(fc2_out, 0)
fc3_out <- relu2_out %*% as.array(PARAMS$fc3_weight) + as.array(PARAMS$fc3_bias)
# Softmax over the 3 logits; should match mxnet's prediction column.
Softmax_out <- exp(fc3_out)/sum(exp(fc3_out))
cbind(t(Softmax_out), preds)
## [,1] [,2]
## [1,] 9.996370e-01 9.996370e-01
## [2,] 3.629565e-04 3.629570e-04
## [3,] 1.929289e-13 1.929295e-13
– 讓我們稍微整理一下梯度消失問題的解決方案:
改變非線性轉換函數,像是ReLU、LeakyReLU等
數據標準化,像是Batch Normalization
從優化手段上下手,像是使用Adam替代SGD
改寫損失函數,像是殘差平方和到交叉熵、直通通道等
改變網路結構,像是Residual Learning、Dense Connection等
– 我們應該驚訝於深度學習領域的研究進展之快,並且基石級的突破居然出現在如此近代的研究中,這也是為什麼直到近年的第三波人工智慧革命到目前為止都仍然火熱。自從Residual Learning讓1000層的網路變成可行後,讓人工智慧(神經網路)再一次成為了主流,之後的課程我們會先從2012年的AlexNet開始依序介紹幾個在深度學習領域中的經典研究,以進一步學習其中奧妙!